This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
options(scipen = 999)
library(tidyverse)
library(ggplot2)
library(plotly)
library(data.table)
library(lubridate)
library(dplyr)
library(corrplot)
library(leaflet)
#Get only the Summary data for some initial analysis
summary_raw_data<-read_csv ('data/claim_summary_v1.csv' )
Parsed with column specification:
cols(
.default = col_double(),
esco_id = [31mcol_character()[39m,
bene_hic_num = [31mcol_character()[39m,
claim_first_dialysis_date = [31mcol_character()[39m,
claim_last_dialysis_date = [31mcol_character()[39m
)
See spec(...) for full column specifications.
summary_wds<-summary_raw_data
#Get the data with all the required fields for analysis
#claim_detail_raw_data<-fread("data/claim_details.csv",sep = "|",fill = T)
#detail_raw_data<-read_csv ('data/claim_details.csv' )
locations<-read_csv('data/ESCo_LOCATIONS_lo.csv' )
Parsed with column specification:
cols(
location_id = [31mcol_character()[39m,
short_name = [31mcol_character()[39m,
location_address_1 = [31mcol_character()[39m,
location_address_2 = [31mcol_character()[39m,
location_city = [31mcol_character()[39m,
location_state = [31mcol_character()[39m,
location_zip_code = [32mcol_double()[39m,
latitude = [32mcol_double()[39m,
longitude = [32mcol_double()[39m
)
INITIAL ANALYSIS WITH THE SUMMARY DATA ONLY.
summary_wds %>%
select (-esco_id,-bene_hic_num,-esco_aligned_flag) %>%
filter(patient_id == '811454') %>%
arrange(dos_year,dos_month)
NA
str(summary_wds)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 10364 obs. of 35 variables:
$ esco_id : chr "E0050" "E0050" "E0050" "E0050" ...
$ bene_hic_num : chr "161446232A" "161446232A" "161446232A" "161480276A" ...
$ patient_id : num 811454 811454 811454 814249 814249 ...
$ dos_year : num 2018 2018 2018 2017 2017 ...
$ dos_month : num 1 8 11 2 5 1 4 4 2 8 ...
$ claim_first_dialysis_date: chr "2018-01-03 00:00:00.000" "2018-08-01 00:00:00.000" "2018-11-02 00:00:00.000" "2017-02-01 00:00:00.000" ...
$ claim_last_dialysis_date : chr "2018-01-31 00:00:00.000" "2018-08-31 00:00:00.000" "2018-11-30 00:00:00.000" "2017-02-06 00:00:00.000" ...
$ dci_claims : num 1 0 0 1 1 1 1 1 1 0 ...
$ non_dci_claims : num 3 7 11 3 1 1 6 67 1 3 ...
$ payment : num 2512 2853 3498 2272 2617 ...
$ part_a : num 2053 2323 2160 2068 2432 ...
$ part_b_phys : num 459 530 1300 204 185 ...
$ part_b_dme : num 0 0 39.1 0 0 ...
$ inpatient : num 0 0 0 0 0 ...
$ outpatient_dialysis : num 2053 2323 2160 2068 2432 ...
$ outpatient_er : num 0 0 0 0 0 ...
$ outpatient_other : num 0 0 0 0 0 ...
$ hha : num 0 0 0 0 0 0 0 0 0 0 ...
$ snf : num 0 0 0 0 0 ...
$ hospice : num 0 0 0 0 0 0 0 0 0 0 ...
$ full_encounter : num 0 0 0 0 0 0 0 0 0 0 ...
$ phys_neph : num 222 222 222 185 185 ...
$ phys_hosp : num 0 0 0 0 0 ...
$ phys_ed : num 0 0 0 0 0 ...
$ vasc_access : num 0 0 916 0 0 ...
$ ambulance : num 0 0 0 0 0 ...
$ phys_other : num 236.9 307.9 162.1 19.2 0 ...
$ dme : num 0 0 39.1 0 0 ...
$ esco_aligned_flag : num 1 1 1 1 1 1 1 1 1 1 ...
$ inpatient_fluid : num 0 0 0 0 0 0 0 0 0 0 ...
$ outpatient_er_fluid : num 0 0 0 0 0 0 0 0 0 0 ...
$ part_a_other_fluid : num 0 0 0 0 0 0 0 0 0 0 ...
$ inpatient_access : num 0 0 0 0 0 0 0 0 0 0 ...
$ outpatient_er_access : num 0 0 0 0 0 0 0 0 0 0 ...
$ part_a_other_access : num 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "spec")=
.. cols(
.. esco_id = [31mcol_character()[39m,
.. bene_hic_num = [31mcol_character()[39m,
.. patient_id = [32mcol_double()[39m,
.. dos_year = [32mcol_double()[39m,
.. dos_month = [32mcol_double()[39m,
.. claim_first_dialysis_date = [31mcol_character()[39m,
.. claim_last_dialysis_date = [31mcol_character()[39m,
.. dci_claims = [32mcol_double()[39m,
.. non_dci_claims = [32mcol_double()[39m,
.. payment = [32mcol_double()[39m,
.. part_a = [32mcol_double()[39m,
.. part_b_phys = [32mcol_double()[39m,
.. part_b_dme = [32mcol_double()[39m,
.. inpatient = [32mcol_double()[39m,
.. outpatient_dialysis = [32mcol_double()[39m,
.. outpatient_er = [32mcol_double()[39m,
.. outpatient_other = [32mcol_double()[39m,
.. hha = [32mcol_double()[39m,
.. snf = [32mcol_double()[39m,
.. hospice = [32mcol_double()[39m,
.. full_encounter = [32mcol_double()[39m,
.. phys_neph = [32mcol_double()[39m,
.. phys_hosp = [32mcol_double()[39m,
.. phys_ed = [32mcol_double()[39m,
.. vasc_access = [32mcol_double()[39m,
.. ambulance = [32mcol_double()[39m,
.. phys_other = [32mcol_double()[39m,
.. dme = [32mcol_double()[39m,
.. esco_aligned_flag = [32mcol_double()[39m,
.. inpatient_fluid = [32mcol_double()[39m,
.. outpatient_er_fluid = [32mcol_double()[39m,
.. part_a_other_fluid = [32mcol_double()[39m,
.. inpatient_access = [32mcol_double()[39m,
.. outpatient_er_access = [32mcol_double()[39m,
.. part_a_other_access = [32mcol_double()[39m
.. )
summary(summary_wds)
esco_id bene_hic_num patient_id dos_year dos_month claim_first_dialysis_date claim_last_dialysis_date dci_claims non_dci_claims payment
Length:10364 Length:10364 Min. : 30111 Min. :2017 Min. : 1.000 Length:10364 Length:10364 Min. :0.0000 Min. : 0.0 Min. : 34.18
Class :character Class :character 1st Qu.:798197 1st Qu.:2017 1st Qu.: 4.000 Class :character Class :character 1st Qu.:1.0000 1st Qu.: 3.0 1st Qu.: 2664.22
Mode :character Mode :character Median :806827 Median :2018 Median : 7.000 Mode :character Mode :character Median :1.0000 Median : 6.0 Median : 3284.65
Mean :711615 Mean :2018 Mean : 6.521 Mean :0.9818 Mean : 11.7 Mean : 6916.75
3rd Qu.:812964 3rd Qu.:2018 3rd Qu.: 9.000 3rd Qu.:1.0000 3rd Qu.: 14.0 3rd Qu.: 6039.45
Max. :910902 Max. :2018 Max. :12.000 Max. :5.0000 Max. :138.0 Max. :139204.35
part_a part_b_phys part_b_dme inpatient outpatient_dialysis outpatient_er outpatient_other hha snf hospice
Min. : 13.17 Min. : 0.0 Min. : 0.00 Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.00
1st Qu.: 2321.80 1st Qu.: 224.9 1st Qu.: 0.00 1st Qu.: 0 1st Qu.:2037 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.00
Median : 2730.59 Median : 369.4 Median : 0.00 Median : 0 Median :2323 Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.00
Mean : 5898.76 Mean : 957.7 Mean : 60.23 Mean : 2601 Mean :2299 Mean : 106.7 Mean : 391.9 Mean : 160.5 Mean : 323.2 Mean : 16.37
3rd Qu.: 4754.09 3rd Qu.: 1002.8 3rd Qu.: 4.33 3rd Qu.: 0 3rd Qu.:2609 3rd Qu.: 0.0 3rd Qu.: 115.2 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 0.00
Max. :131619.23 Max. :22145.9 Max. :17581.02 Max. :131619 Max. :7283 Max. :26135.5 Max. :25749.6 Max. :9011.7 Max. :22185.7 Max. :7032.21
full_encounter phys_neph phys_hosp phys_ed vasc_access ambulance phys_other dme esco_aligned_flag inpatient_fluid outpatient_er_fluid
Min. :0 Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0 Min. : 0.0 Min. : 0.00 Min. : 0.00 Min. :1 Min. : 0.0 Min. : 0.000
1st Qu.:0 1st Qu.:180.8 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 17.26 1st Qu.: 0.00 1st Qu.:1 1st Qu.: 0.0 1st Qu.: 0.000
Median :0 Median :213.8 Median : 0.0 Median : 0.00 Median : 0 Median : 0.0 Median : 111.42 Median : 0.00 Median :1 Median : 0.0 Median : 0.000
Mean :0 Mean :189.0 Mean : 259.2 Mean : 28.58 Mean : 75 Mean : 100.7 Mean : 305.26 Mean : 60.23 Mean :1 Mean : 197.4 Mean : 7.179
3rd Qu.:0 3rd Qu.:220.4 3rd Qu.: 0.0 3rd Qu.: 0.00 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.: 309.44 3rd Qu.: 4.33 3rd Qu.:1 3rd Qu.: 0.0 3rd Qu.: 0.000
Max. :0 Max. :877.3 Max. :10036.4 Max. :1007.74 Max. :5523 Max. :8815.2 Max. :16197.08 Max. :17581.02 Max. :1 Max. :42387.0 Max. :4956.360
part_a_other_fluid inpatient_access outpatient_er_access part_a_other_access
Min. : 0.00 Min. : 0 Min. : 0.00 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 0 1st Qu.: 0.00 1st Qu.: 0.00
Median : 0.00 Median : 0 Median : 0.00 Median : 0.00
Mean : 36.26 Mean : 281 Mean : 14.89 Mean : 92.67
3rd Qu.: 0.00 3rd Qu.: 0 3rd Qu.: 0.00 3rd Qu.: 0.00
Max. :14684.18 Max. :92109 Max. :11694.69 Max. :15409.49
summary(summary_wds$payment)
Min. 1st Qu. Median Mean 3rd Qu. Max.
34.18 2664.22 3284.65 6916.75 6039.45 139204.35
summary_wds %>%
ggplot(aes(x=payment)) +
geom_histogram(breaks = seq(500,100000,by=1000),
bins=20,
col="red",
fill = "green",
alpha = 0.2) +
scale_x_log10() +
labs(x="Payments", y = "Counts",title = "Payments Histogram")
change_increase<-summary_wds %>% select (patient_id,payment,dos_year,dos_month) %>% group_by(patient_id,dos_year,add=TRUE) %>% summarise(sum_payment = sum(payment), num_of_months = NROW(dos_month)) %>% ungroup %>% filter(num_of_months >11) %>% arrange (num_of_months,patient_id,dos_year,desc(sum_payment))
change_increase\(patient_id <- as.factor(change_increase\)patient_id)
change_increase %>% filter (patient_id == ‘403675’)
change_increase\(patient_id <- as.factor(change_increase\)patient_id ) change_increase\(dos_year <- as.factor(change_increase\)dos_year)
pl <- ggplot( change_increase,aes(y=sum_payment, x = patient_id ,fill=dos_year)) + geom_bar(stat = “identity”,position = ‘dodge’) + labs(x=“patient_id”, y= “Payments”) + ggtitle(“yearly payments difference for patients”)
pl
#pivot the data and understand hte percentage change in payments and get expensive patients expensive_patients<-pivot_wider(change_increase, names_from = dos_year, values_from = sum_payment, values_fill = list(sum_payment = 0)) %>% mutate(percent_change = (2018-2017)/2017 * 100) %>% filter(percent_change > 50 & 2017 != 0) %>% arrange (desc(percent_change))
expensive_patients
summary_wds %>%
filter(patient_id =='798094') %>%
arrange (dos_year,dos_month)
NA
library(dplyr)
year_plot<- summary_wds %>%
group_by(dos_year,dos_month) %>%
summarise(sum_pay = sum(payment)) %>%
ungroup()
year_plot
year_plot$dos_month <- as.factor(year_plot$dos_month)
year_plot$dos_year <- as.factor(year_plot$dos_year)
pl <- ggplot( year_plot,aes(y=sum_pay, x = dos_month ,fill=dos_year)) +
geom_bar(stat = "identity",position = 'dodge') +
labs(x="Months", y= "Payments") +
ggtitle("Increase in Payments from 2017 to 2018")
pl
NA
#redo this with non summarize raw data
plbox <- ggplot(year_plot,aes(y=sum_pay, x = dos_month,fill=dos_month) )+
geom_boxplot() +
labs(x="Months", y= "Payments") +
ggtitle("Distribution of Payments by Months") +
theme_dark()
plbox
NA
#redo this with non summarize raw data
dci_data_ws %>%
select(patient_id,modality,payment) %>%
filter(modality =='HIC') %>%
arrange(desc(payment)) %>%
head(n = 30) %>%
ggplot(aes(x=reorder(patient_id,payment),y=payment,fill=patient_id))+
geom_bar(stat="identity") +
coord_flip() +
labs(x="Patient ID",
y="Payments",
title = "Top 30 patients with high payments")
NA
NA
NA
NA
NA
NA
NA
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
summary_2017<-summary_wds %>%
group_by(dos_month,dos_year) %>%
summarise(
total_patients = NROW(patient_id),
totalpayments = sum(payment),
min_pay = min(payment),
max_pay = max(payment),
avg_pay = sum(payment)/NROW(patient_id)) %>%
ungroup() %>%
arrange(dos_year,dos_month) %>%
filter(dos_year =="2017")
summary_2017
summary_2017_pivot <- summary_2017 %>%
pivot_longer(min_pay:avg_pay, names_to = "payment_type", values_to = "payment")
summary_2017_pivot$dos_month = as.factor(summary_2017_pivot$dos_month)
summary_2017_pivot
NA
pl<- ggplot(summary_2017_pivot,aes(x=dos_month, y = payment, fill = payment_type)) +
geom_col(stat = "identity",position = 'dodge') +
geom_hline(yintercept = 6000, linetype = "dashed",color = "darkred") +
labs(x="Months", y = "Payments") +
ggtitle("2017 minimum, maximum and average payments")
Ignoring unknown parameters: stat
pl + scale_y_continuous(breaks = seq(0,130000,5000))
NA
NA
all_payments<-summary_wds %>%
group_by(dos_month) %>%
summarise(
total_patients = NROW(patient_id),
totalpayments = sum(payment),
min_pay = min(payment),
max_pay = max(payment),
avg_pay = sum(payment)/NROW(patient_id)) %>%
ungroup() %>%
pivot_longer(min_pay:avg_pay, names_to='payment_type',values_to = 'payments')
all_payments
#redo with box plot
pl_all<- ggplot(all_payments,aes(x=dos_month, y = payments, fill = payment_type)) +
geom_col(stat = "identity",position = 'dodge') +
scale_x_continuous(name ="Months",breaks = seq(1,12,1)) +
geom_hline(yintercept = 6000, linetype = "dashed",color = "darkred") +
labs(x="Months", y = "Payments") +
ggtitle("Minimum, Maximum and Average payments")
Ignoring unknown parameters: stat
pl_all + scale_y_continuous(breaks = seq(0,130000,5000))
ANALYSIS USING SUMMARY AND DETAIL DATA Get the data and clean it
#get detail and summary data and save it in a df
detail_raw_data<-read_csv ('data/claim_details.csv' )
Duplicated column names deduplicated: 'patient_id' => 'patient_id_1' [37]Parsed with column specification:
cols(
.default = col_double(),
esco_id = [31mcol_character()[39m,
bene_hic_num = [31mcol_character()[39m,
claim_first_dialysis_date = [31mcol_character()[39m,
claim_last_dialysis_date = [31mcol_character()[39m,
location_id = [31mcol_character()[39m,
start_date = [34mcol_datetime(format = "")[39m,
end_date = [34mcol_datetime(format = "")[39m,
esrd_date = [34mcol_datetime(format = "")[39m,
modality = [31mcol_character()[39m,
hgb_cutoff = [31mcol_character()[39m,
epo_ceiling = [31mcol_character()[39m,
route_freq = [31mcol_character()[39m,
drug_name = [31mcol_character()[39m,
ferritin_cutoff = [31mcol_character()[39m,
tsat_cutoff = [31mcol_character()[39m,
sup_name = [31mcol_character()[39m,
tx_epo = [31mcol_character()[39m,
venofer_given = [31mcol_character()[39m,
venofer_wasted = [31mcol_character()[39m,
ferrlecit_given = [31mcol_character()[39m
# ... with 42 more columns
)
See spec(...) for full column specifications.
1 parsing failure.
row col expected actual file
7272 esrd_date date like NULL 'data/claim_details.csv'
dci_data<-detail_raw_data
dci_data<-dci_data %>%
select(-esco_id,-bene_hic_num,-full_encounter,-phys_neph,-phys_hosp,-phys_ed,-ambulance,-phys_other,-esco_aligned_flag,-inpatient_fluid,-outpatient_er_fluid,-part_a_other_fluid,-patient_id_1,-dos_yyyy,-dos_mm,-esrd_date,-hgb_date,-tsat_date,-ferr_date,-albumin_date,-pth_date,-ca_date,-cca_date,-ph_date,-k_date,-urr_date,-ktv_date,-epo_given,-venofer_wasted,-inpatient_access,-outpatient_er_access,-part_a_other_access,-dci_claims,-non_dci_claims,-hha,-hospice,-dme,-start_date,-end_date,-epo_ceiling,-route_freq,-drug_name,-ferritin_cutoff,-tsat_cutoff,-hgb_cutoff,-ferrlecit_wasted,-zemplar_iv_wasted,-calcijex_iv_wasted,-feraheme_wasted,-hectorol_iv_wasted,-tx_missed,-sensipar_dispensed)
#make sure all the columns are in correct data types.
#change claims date columns to date datatype
date_columns <- c("claim_first_dialysis_date","claim_last_dialysis_date")
dci_data[date_columns] <- lapply(dci_data[date_columns],as.Date)
#change the other columns to factor
fac_columns <- c("patient_id","dos_year","location_id","dos_month","modality","tx","tx_epo","sup_name")
dci_data[fac_columns] <- lapply(dci_data[fac_columns],as.factor)
#change the below columns to logical
bool_columns <- c("epo_protocol_flag","iron_protocol_flag","nutsup_protocol_flag","hgb_exclude_flag","active_flag")
dci_data[bool_columns] <- lapply(dci_data[bool_columns],as.logical)
#Make below colums as logical true if they have any value else if they have null make it logical false.
dci_data<-dci_data %>%
mutate_at(vars("ferrlecit_given","feraheme_given","venofer_given","zemplar_iv_given","hectorol_iv_given","calcijex_iv_given","zemplar_or_given","hectorol_or_given","calcijex_iv_given","zemplar_or_given","hectorol_or_given","calcijex_or_given","activase_given","prostat_given","nepro_given","liquacel_given","has_catheter","aranesp_given","protinex_given","mircera_given","sensipar_given","parsabiv_given","protein_bar_given",
"ice_cream_given","gelatein_given"),
funs(case_when(.=="NULL" ~ FALSE,
TRUE ~ TRUE)))
#FIRST make the null VALUES in char col to zero's except for date columns
num_columns <- c("hgb","tsat","ferr","albumin","pth","ca","cca","ph","k","urr","ktv","tx_epo")
dci_data[num_columns]<-dci_data[num_columns]%>%
replace(.=="NULL","0")
#change all the character columns ot numeric
dci_data<-dci_data %>% mutate_if(is.character,as.numeric)
#names(dci_data)
#as for date columns we cannot replace na values ot 0. first make them character columns and then make rest to the na values to 0 in entire dataframe
dci_data$claim_first_dialysis_date <-as.character.Date(dci_data$claim_first_dialysis_date )
dci_data$claim_last_dialysis_date <-as.character.Date(dci_data$claim_last_dialysis_date )
#make all na values to 0 in entire dataframe
dci_data[is.na(dci_data)]<-0
# sum(is.na(dci_data))
#making the dates column back to date datatype
dci_data$claim_first_dialysis_date <- as.Date(dci_data$claim_first_dialysis_date)
dci_data$claim_last_dialysis_date <- as.Date(dci_data$claim_last_dialysis_date)
#delete the outliers( payments which are less than 1500)
dci_data<-dci_data %>%
filter(`payment`>1500) %>%
arrange(desc(payment))
#combine 2 separate part_b payments to one
dci_data<-dci_data %>%
mutate(part_b = part_b_phys + part_b_dme)
#save to RDS file
dci_data_ws<- dci_data
saveRDS(dci_data_ws,file="DCI_midcourse/data/dci_data.rds")
library(dplyr)
library(ggplot2)
# Step 1
gpl<-dci_data_ws %>%
#Step 2
group_by(modality) %>%
#Step 3
summarise(mean_payments= mean(payment)) %>%
#Step 4
ggplot(aes(x = modality, y = mean_payments, fill = modality)) +
geom_bar(stat = "identity") +
theme_classic() +
labs(
x = "Modality",
y = "Average Payments",
title = "Average Payments vs Modality"
)
gpl+ scale_x_discrete(labels=c("HH" = "Home Hemo", "HIC" = "Hemo In-Center",
"PD" = "Peritoneal"))
dci_data %>%
select(payment,part_a,part_b_phys,part_b_dme,part_b)
#sum(is.na(dci_data$claim_last_dialysis_date))
sum(is.na(dci_data))
[1] 220
str(dci_data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 9970 obs. of 72 variables:
$ patient_id : Factor w/ 672 levels "30111","39732",..: 293 50 64 251 21 31 293 86 441 172 ...
$ dos_year : Factor w/ 2 levels "2017","2018": 1 1 2 1 2 2 1 2 2 2 ...
$ dos_month : Factor w/ 12 levels "1","2","3","4",..: 7 7 4 7 3 7 9 2 3 8 ...
$ claim_first_dialysis_date: Date, format: NA NA NA "2017-07-13" ...
$ claim_last_dialysis_date : Date, format: NA NA NA "2017-07-29" ...
$ payment : num 128637 122674 121244 112103 109108 ...
$ part_a : num 119013 119153 111345 110036 102586 ...
$ part_b_phys : num 9623 3521 9899 2067 6521 ...
$ part_b_dme : num 0 0 0 0 0 ...
$ inpatient : num 119013 119153 111345 108511 102586 ...
$ outpatient_dialysis : num 0 0 0 1525 0 ...
$ outpatient_er : num 0 0 0 0 0 ...
$ outpatient_other : num 0 0 0 0 0 ...
$ snf : num 0 0 0 0 0 ...
$ vasc_access : num 179 0 309 0 0 ...
$ location_id : Factor w/ 25 levels "000026","000055",..: 18 22 17 24 25 3 18 1 4 5 ...
$ modality : Factor w/ 3 levels "HH","HIC","PD": 2 2 2 2 2 2 2 2 2 2 ...
$ epo_protocol_flag : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ iron_protocol_flag : logi FALSE TRUE TRUE TRUE TRUE TRUE ...
$ nutsup_protocol_flag : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ sup_name : Factor w/ 11 levels "Body Quest Ice Cream",..: 4 5 9 4 4 11 4 11 11 1 ...
$ tx : Factor w/ 24 levels "0","1","2","3",..: 9 1 5 1 1 3 1 1 6 1 ...
$ tx_epo : Factor w/ 16 levels "0","1","10","11",..: 9 1 2 1 1 8 1 1 2 1 ...
$ venofer_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ferrlecit_given : logi TRUE FALSE FALSE FALSE FALSE FALSE ...
$ feraheme_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ zemplar_iv_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ hectorol_iv_given : logi TRUE FALSE TRUE FALSE FALSE TRUE ...
$ calcijex_iv_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ zemplar_or_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ hectorol_or_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ calcijex_or_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ activase_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ prostat_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ nepro_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ liquacel_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ hgb : num 11.1 0 8.6 0 0 10.4 0 0 0 0 ...
$ tsat : num 23 39 38 23 32 56 0 0 18 50 ...
$ ferr : num 2051 1401 1591 1037 667 ...
$ albumin : num 2.3 0 0 0 0 3.5 0 0 0 0 ...
$ pth : num 559 577 224 0 641 ...
$ ca : num 8.7 0 0 0 0 8.7 0 0 0 0 ...
$ cca : num 8.7 0 0 0 0 9.1 0 0 0 0 ...
$ ph : num 2.3 0 0 0 0 5 0 0 0 0 ...
$ k : num 4.9 0 0 0 0 5.1 0 0 0 0 ...
$ urr : num 72 0 70 0 0 78 0 0 0 0 ...
$ ktv : num 1.45 0 1.35 0 0 1.72 0 0 0 0 ...
$ has_catheter : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ aranesp_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ protinex_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ hgb_exclude_flag : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ mircera_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ sensipar_given : logi FALSE FALSE TRUE FALSE TRUE TRUE ...
$ parsabiv_given : logi FALSE FALSE TRUE FALSE TRUE TRUE ...
$ protein_bar_given : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
$ ice_cream_given : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
$ gelatein_given : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
$ active_flag : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ hospital_episodes : num 2 1 2 2 1 3 1 0 1 1 ...
$ hospital_admits : num 2 0 2 1 0 3 0 0 1 0 ...
$ hospital_discharges : num 1 0 1 1 0 3 0 0 0 0 ...
$ er_visits : num 0 0 0 0 0 0 0 0 1 0 ...
$ snf_episodes : num 0 0 0 0 0 1 0 1 1 0 ...
$ snf_admit : num 0 0 0 0 0 0 0 0 0 0 ...
$ snf_discharge : num 0 0 0 0 0 1 0 0 1 0 ...
$ med_orders : num 8 16 10 7 12 46 8 9 20 16 ...
$ cardio : num 2 5 2 2 3 8 2 2 1 3 ...
$ beta_blockers : num 1 1 0 1 1 1 1 1 1 1 ...
$ antihypertensives : num 0 1 0 0 0 3 0 0 0 0 ...
$ opioids : num 1 0 0 0 0 1 1 1 1 0 ...
$ antidiabetics : num 0 1 3 1 1 4 0 0 1 3 ...
$ part_b : num 9623 3521 9899 2067 6521 ...
Histograms to understand the distribution of the data for payments
dci_data_ws %>%
select (payment,part_a,part_b_phys,part_b_phys) %>%
ggplot(aes(x=payment) )+
geom_histogram(color="#e9ecef", alpha=0.6, position = 'identity', bins=40) +
scale_x_log10()+
scale_fill_manual(values=c("#69b3a2", "#404080")) +
labs(x="loggged payment values",
y="Counts",
title = "Payment distribution")
#make this interactive
p<- ggplot(dci_data_ws,aes(x=payment,y=..density..)) +
geom_histogram(bins=40,binwidth = 0.05,fill = "black",color="black",alpha=0.2) +
scale_x_log10()+
geom_density(color = "red")+
labs(x="Logged payments",
y="Frequency",
title = "Distribution of total payments")
ggplotly(p)
NA
NA
NA
#make this interactive
plt_mod<-dci_data_ws %>%
filter(dos_month == 6,dos_year==2017) %>%
ggplot(aes(x=payment,fill=modality)) +
geom_histogram(bins=10,binwidth = 0.25,alpha=0.8,position = "dodge") +
scale_x_log10()+
labs(x="loggged payment values",
y="Frequency",
title = "Payment distribution")
plt_mod
Modality distributions
#density distribution for modalities
ggplot(dci_data_ws,aes(payment, color = modality,fill=modality)) +
scale_x_log10()+
geom_density(alpha = 0.8,position = "dodge") +
labs(x="Payments",
y="Density",
title="Distribution on payments by modality")
dci_data_ws %>%
select(payment,modality) %>%
group_by(modality) %>%
summarise(total_payments=sum(payment),num_of_rows=n())
#Function to create an histgram for each kind of madality HIC,HH AND PD
make_plot <- function(mod){
dci_data_ws %>%
select(payment,modality) %>%
filter(modality == mod) %>%
ggplot(aes(x=payment)) +
geom_histogram(binwith=1000) +
scale_x_log10() +
labs(x="payments",
y="frequency",
title = paste("Distribution of",mod))
}
make_plot('HH')
Ignoring unknown parameters: binwith
make_plot('HIC')
Ignoring unknown parameters: binwith
make_plot('PD')
Ignoring unknown parameters: binwith
dci_data_ws %>%
select(payment,modality) %>%
filter(modality == 'HH') %>%
ggplot(aes(x=payment)) +
geom_histogram() +
labs(x="payments",
y="frequency",
title = "Distribution of Hemo Home Payments")
NA
dci_data_ws %>%
select(payment,modality) %>%
#filter(payment>2800 & payment <3500) %>%
group_by(modality)
NA
NA
NA
NA
dci_data_ws %>%
select(payment,modality) %>%
filter(payment>2800 & payment <3500) %>%
#group_by(modality) %>%
filter(modality=='PD') %>%
ggplot(aes(x=payment)) +
geom_histogram(binwidth = 4000)
dci_data_ws %>%
select(payment,modality) %>%
filter(payment>2800 & payment <3500)
library(reshape2)
dci_long <- reshape2::melt(dci_data_ws)
ggplot(dci_long, aes(value)) + facet_wrap(~variable, scales = 'free_x') +
geom_histogram()
#frequency distribution with density lines
#x<- sample(0:30, 200, replace=T, prob=15 - abs(15 - 0:30))
x<-dci_data_ws$payment
## Calculate and plot the two histograms
hcum <- h <- hist(x, plot=FALSE)
hcum$counts <- cumsum(hcum$counts)
plot(hcum, main="")
plot(h, add=T, col="grey")
## Plot the density and cumulative density
d <- density(x)
lines(x = d$x, y = d$y * length(x) * diff(h$breaks)[1], lwd = 2)
lines(x = d$x, y = cumsum(d$y)/max(cumsum(d$y)) * length(x), lwd = 2)
#distribution by payment type
payment_type_dist<-dci_data_ws %>%
select(dos_year,dos_month,payment,part_a,part_b,inpatient,outpatient_dialysis)%>%
pivot_longer(payment:outpatient_dialysis,names_to="payment_type",values_to = "payment")
saveRDS(payment_type_dist,file="DCI_midcourse/data/payment_type_dist.rds")
#fat tailed dis.
box_pl<-payment_type_dist %>%
filter(dos_year == 2017 & dos_month == 1) %>%
ggplot(aes(y = payment,x = payment_type,fill=payment_type)) +
geom_boxplot()+
# scale_y_log10() +
theme_classic() +
labs(x="Payment type", y = "Payment",
title = "Payment distribution by payment types")
ggplotly(box_pl)
#plot the fequency distribution across different payment types
pl<-dci_data_ws %>%
select(dos_year,dos_month,location_id,payment,inpatient,outpatient_dialysis)%>%
pivot_longer(payment:outpatient_dialysis,names_to="payment_type",values_to = "payment") %>%
filter(dos_year==2017 & dos_month==1) %>%
ggplot( aes(x=payment,fill=payment_type,color= payment_type)) +
geom_histogram(bins= 40,binwidth = 0.05,alpha=0.5,position = "identity") +
scale_x_continuous(breaks = seq(0,130000,5000)) +
scale_x_log10() +
geom_vline(aes(xintercept = mean(payment,na.rm=T)),color = "red", linetype = "dashed",size = 1) +
labs(x="Logged payments",y="Frequency",title = "Payments: Inpatient/Hospitalization or Outpatient")
Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
pl
sum(is.na(dci_data$payment))
years_plot<- dci_data_ws %>%
group_by(dos_year,dos_month) %>%
summarise(sum_pay = sum(payment)) %>%
ungroup()
pl <- ggplot( years_plot,aes(y=sum_pay, x = dos_month ,fill=dos_year)) +
geom_bar(stat = "identity",position = 'dodge') +
labs(x="Months", y= "Payments") +
ggtitle("Total payments increase in each month")
pl
years_plot
NA
ggplotly(dci_data_ws %>%
group_by(modality,dos_month,dos_year) %>%
summarise(payments = sum(payment)) %>%
ungroup() %>%
# ggplot(aes(x=modality,y=payments,fill = `dos_year`)) +
ggplot(aes(x=dos_month,y=payments,fill = `modality`)) +
geom_bar(stat="identity",position="dodge" ) )
NA
NA
#show how modalities are doing by total payments
modality_pl<-dci_data_ws %>%
group_by(dos_year,modality) %>%
summarise(payments = sum(payment)) %>%
ungroup() %>%
ggplot(aes(x=dos_year,y=payments,fill=modality,color=modality))+
geom_bar(stat="identity",position="dodge") +
geom_text(aes(label = round(payments),vjust=0),postion=position_dodge(width=5)) +
labs(x="Modalities in 2017 and 2018",y="payments",title="Payments by modality")
Ignoring unknown parameters: postion
modality_pl
dci_data_ws %>%
ggplot(aes(x=dos_year,y=payment,color = modality)) +
geom_boxplot() +
scale_y_log10() +
labs(x = "Years",y="payments",
title = "Payments in in 2017 and 2018")
#Top 50 most expensive patients
Top50_expensive_patients<-head(arrange(dci_data_ws,desc(payment)), n = 50)
plt_50<-Top50_expensive_patients %>%
#mutate(exp_patient = max(payment)) %>%
ggplot(aes(y=payment,x=patient_id, fill=modality)) +
geom_bar(stat = "identity" ) +
theme_bw()+
labs(x="Patient ID's",
y="Payments",
title="50 expensive patients")
plt_50 + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Percentage bars
library(plotly)
set.seed(123)
df <- dci_data_ws %>%
select(payment,inpatient,outpatient_dialysis,modality) %>%
pivot_longer(inpatient:outpatient_dialysis,names_to = "hospitalization",values_to = "payments")
p <- ggplot(df, aes(x = modality)) +
geom_bar(aes(y = ..count../sum(..count..), fill = hospitalization)) +
scale_fill_brewer(palette = "Set3") +
ylab("Percent") +
ggtitle("Show precentages in bar chart")
p <- ggplotly(p)
p
NA
dci_data_ws %>%
select(payment,inpatient,outpatient_dialysis,modality) %>%
group_by(modality) %>%
summarise(payment=sum(payment), inpatient=sum(inpatient),outpatient=sum(outpatient_dialysis)) %>%
ungroup() %>%
pivot_longer(inpatient:outpatient,names_to = "payType",values_to = "pays") %>%
ggplot(aes(x=modality,y=payment,fill=payType))+
geom_bar(stat="identity",position="stack")
other_columns <- c("payment","hospital_episodes","er_visits","cardio","opioids","antidiabetics","beta_blockers","antihypertensives","snf_discharge")
corrs_others <- dci_data_ws %>% select(other_columns) %>%
cor()
corrs_others
payment hospital_episodes er_visits cardio opioids antidiabetics beta_blockers antihypertensives snf_discharge
payment 1.000000000 0.37753105 0.081022685 -0.002692458 0.10955789 0.068742739 0.012452010 -0.039596398 0.104974700
hospital_episodes 0.377531051 1.00000000 0.138645269 0.033031308 0.10900633 0.045678897 0.037979095 -0.011190044 0.176735065
er_visits 0.081022685 0.13864527 1.000000000 0.013565237 0.05116460 0.003286641 0.018862131 -0.002438958 0.030687705
cardio -0.002692458 0.03303131 0.013565237 1.000000000 -0.04761030 0.203589325 0.490667836 0.656388881 -0.008774101
opioids 0.109557887 0.10900633 0.051164599 -0.047610303 1.00000000 -0.035631457 -0.064462712 -0.034940410 0.079861605
antidiabetics 0.068742739 0.04567890 0.003286641 0.203589325 -0.03563146 1.000000000 0.089209911 0.022751965 0.032889875
beta_blockers 0.012452010 0.03797909 0.018862131 0.490667836 -0.06446271 0.089209911 1.000000000 0.198152182 -0.004428954
antihypertensives -0.039596398 -0.01119004 -0.002438958 0.656388881 -0.03494041 0.022751965 0.198152182 1.000000000 -0.013584271
snf_discharge 0.104974700 0.17673506 0.030687705 -0.008774101 0.07986161 0.032889875 -0.004428954 -0.013584271 1.000000000
# Change background color to lightblue
corrplot(corrs_others, type = "upper", order = "hclust",
col = c("black", "white"), bg = "lightblue")
head(dci_data_ws)
#ask michael about how to plot this
dci_data_ws[,6:36] %>%
select(-part_a,-part_b_phys,-outpatient_other,-part_b_dme,-inpatient,-outpatient_dialysis,-outpatient_er,-snf,-vasc_access,-epo_protocol_flag,-iron_protocol_flag,-nutsup_protocol_flag,-sup_name,-tx,-tx_epo)
NA
correlational plot
num_columns <- c("hgb","tsat","ferr","albumin","pth","ca","cca","ph","k","urr","ktv")
corrs<-dci_data_ws %>% select(num_columns) %>%
# drop_na_() %>%
cor()
corrs
hgb tsat ferr albumin pth ca cca ph k urr ktv
hgb 1.00000000 0.0527380941 0.02570081 0.59609612 0.045803409 0.58340275 0.56441343 0.3008968876 0.4724638 0.21430294 0.247192331
tsat 0.05273809 1.0000000000 0.39986167 0.08697933 -0.027163707 0.04433107 0.03910477 0.0002758743 0.0788412 0.06797837 0.087991355
ferr 0.02570081 0.3998616659 1.00000000 0.05098390 0.005239110 0.05485813 0.05667298 -0.0514526023 0.1183593 0.20079170 0.064636183
albumin 0.59609612 0.0869793324 0.05098390 1.00000000 0.104339811 0.76287074 0.67532722 0.3997004606 0.6248656 0.31227746 0.270849884
pth 0.04580341 -0.0271637069 0.00523911 0.10433981 1.000000000 0.03478453 0.01573087 0.2972633464 0.1093904 0.02051048 0.004094452
ca 0.58340275 0.0443310656 0.05485813 0.76287074 0.034784525 1.00000000 0.98442825 0.3359632961 0.6256086 0.27040412 0.303778243
cca 0.56441343 0.0391047722 0.05667298 0.67532722 0.015730870 0.98442825 1.00000000 0.3196373678 0.6140480 0.25157942 0.306643666
ph 0.30089689 0.0002758743 -0.05145260 0.39970046 0.297263346 0.33596330 0.31963737 1.0000000000 0.4445911 0.08190045 0.076739150
k 0.47246376 0.0788412002 0.11835935 0.62486563 0.109390410 0.62560864 0.61404796 0.4445910518 1.0000000 0.34881524 0.209681668
urr 0.21430294 0.0679783689 0.20079170 0.31227746 0.020510483 0.27040412 0.25157942 0.0819004513 0.3488152 1.00000000 0.080500799
ktv 0.24719233 0.0879913547 0.06463618 0.27084988 0.004094452 0.30377824 0.30664367 0.0767391501 0.2096817 0.08050080 1.000000000
num_columns <- c("payment","hgb","tsat","ferr","albumin","pth","ca","cca","ph","k","urr","ktv")
corrs<-dci_data_ws %>% select(num_columns) %>%
cor()
corrplot(corrs,type = "upper",order = "hclust",
tl.col="black",tl.srt=45)
NA
NA
tibble('variable' = corrs[1,2:12] %>% names(),'correlation' = corrs[1,2:12]) %>%
ggplot(aes(x=reorder(variable,correlation),y = correlation)) +
geom_point()+
geom_segment(aes(xend=variable,yend=0))+
coord_flip() +
geom_hline(yintercept = 0) +
labs(x ="Labs",
y ="Correlation",
title = "Correlation between Lab values and Payments")
pl <- ggplot(dci_data_ws, aes(x=albumin,y=payment) ) +
geom_point(alpha=0.2) + geom_smooth(method = 'lm') +
scale_x_log10() +
scale_y_log10()
ggplotly(pl)
Transformation introduced infinite values in continuous x-axisTransformation introduced infinite values in continuous x-axisRemoved 195 rows containing non-finite values (stat_smooth).
#unique(dci_data_ws$ktv)
#hgb vs payment
ggplot(dci_data_ws, aes(x=hgb,y=payment) ) +
geom_point() + geom_smooth(method = 'lm') +
scale_x_log10() +
labs(x="hgb", y = "payment", title = "Payment Vs hgb")
ggplot(dci_data_ws, aes(x=ca,y=payment) ) +
geom_point() + geom_smooth(method = 'lm') +
scale_x_log10() +
labs(x="ca", y = "payment", title = "Payment Vs ca")
Maps
#first prep the dataset to get the information on the map markers
payment_summary_by_loc<-dci_data_ws %>%
select(patient_id,location_id,dos_month,dos_year,payment,part_a,part_b_phys,part_b_dme,modality) %>%
group_by(location_id,dos_month,dos_year ) %>%
summarise(
total_patients = NROW(patient_id),
totalpayments = round(sum(payment),digits=2),
min_pay = min(payment),
max_pay = max(payment),
avg_pay = sum(payment)/NROW(patient_id)) %>%
ungroup() %>%
arrange(dos_year,dos_month)
#merge the grouped data with location dataset
payment_geom_summary <- merge(payment_summary_by_loc,locations,by = "location_id")
payment_geom_summary %>%
filter(location_id =='000055' & dos_month ==1 & dos_year ==2017)
#save it to rds file
saveRDS(payment_geom_summary, file = "DCI_midcourse/data/payment_geom_summary.rds")
payment_geom_summary %>%
arrange(desc(totalpayments)) %>%
filter(dos_year==2017 & dos_month == 1)
NA
#Top 5 locations with high average payments
loc_plt<-payment_summary_by_loc %>%
arrange(desc(totalpayments)) %>%
filter(dos_year==2017 & dos_month == 4) %>%
top_n(5) %>%
ggplot(aes(x=location_id,y=avg_pay)) +
geom_col( )
Selecting by avg_pay
ggplotly(loc_plt)
NA
NA
leaflet(data= locations) %>%
addTiles() %>%
addMarkers(~longitude,
~latitude,
popup = ("hello"))
pl <- ggplot(dci_data_ws, aes(x=albumin,y=payment) ) +
geom_point(alpha=0.2) + geom_smooth(method = 'lm') +
scale_x_log10() +
scale_y_log10()
ggplotly(pl)
dci_data_ws %>%
select(dos_year,dos_month,payment,part_a,part_b) %>%
filter(dos_year==2017,dos_month==1) %>%
pivot_longer(part_a:part_b,names_to = "payType",values_to = "payments") %>%
ggplot(aes(x=payType,y=payments,fill=payments))+
geom_bar(stat="identity",position="dodge")+
labs(x="Payment Types",
y="Payments",
title = "Payments by Payment types")
NA
NA
NA
NA
NA
#Payments by Modalities
show_plot<-dci_data_ws %>%
select(dos_year,dos_month,payment,part_a,part_b,modality) %>%
filter(dos_year==2018,dos_month==1) %>%
ggplot(aes(x=modality,y=payment,fill=payment))+
geom_bar(stat="identity",position="dodge",alpha=0.3)+
labs(x="Modalities",
y="Payments",
title = "Payments by Modalities")
show_plot + scale_x_discrete(labels=c("HH" = "Home Hemo", "HIC" = "Hemo In-Center",
"PD" = "Peritoneal"))
dci_data_ws %>%
select(dos_year,dos_month,payment,part_a,part_b) %>%
mutate(percent_part_a = round(part_a/payment*100)) %>%
filter(dos_year==2017,dos_month==1)
NA
dci_data_shiny %>%
select (location_id,patient_id,payment,part_a,part_b,modality)
NA